library(readr)
library(data.table)
library(Hmisc)
library(scales)
library(readxl)
# Landing information merged with corak



################################################################
##########               LOAD PACKAGES               ###########
################################################################

library(haven)
library(readr)
library(data.table)
library(scales)

`%notin%`=Negate(`%in%`)

#: for each row: one unique IMDB_ID - and their FIRST CENSUS TRACT 

# THIS FILE GETS THE FIRST CENSUS TRACT OF THE PERSON FROM THE LANDING FILE


datayears=c(1982:2019)
uniqueid=0

for(i in 1982:2005){
  print(i)
  
  
  if (i<1986){
    censyear="81"
  }
  
  else if (i>=1986 & i<1991){
    censyear="86"
  }
  
  else if (i>=1991 & i <1996){
    
    censyear="91"
  }
  else if (i>=1996 & i<2001){
    censyear="96"
  }
  else if (i>=2001 & i<2006){
    censyear="01"
  }
  else if(i>=2006 & i<2011){
    censyear="06"
  }
  else{
    censyear="16"
  }
  
  tax <- read_dta(paste("G:/IMDB_AllYears/rdc/IMDB_BDIM_2020_v1/data/stata/core_imdb/imdb_t1ff_", i, "_f3_v1.dta", sep=""), col_select=c("IMDB_ID",paste("CMA",censyear,"F",i,sep=""), paste("PR___F",i,sep=""),paste("XCMA",censyear,"F",i,sep=""),paste("CT",censyear,"_F",i,sep=""),paste("CD",censyear,"_F",i,sep=""),paste("XCD",censyear,"_F",i,sep=""),paste("CSD",censyear,"F",i,sep=""),paste("XCSD",censyear,"F",i,sep=""),paste("XCT",censyear,"_F",i,sep="")))
  setnames(tax, old=c("IMDB_ID",paste("CMA",censyear,"F",i,sep=""),paste("XCMA",censyear,"F",i,sep=""),paste("CT",censyear,"_F",i,sep=""),paste("XCT",censyear,"_F",i,sep=""), paste("PR___F",i,sep=""),paste("CD",censyear,"_F",i,sep=""),paste("XCD",censyear,"_F",i,sep=""),paste("CSD",censyear,"F",i,sep=""),paste("XCSD",censyear,"F",i,sep="")),
           new=c("IMDB_ID", "CMA_F", "XCMA_F", "CT_F", "XCT_F","PR_F","CD_F","XCD_F","CSD_F","XCSD_F"))
  tax=tax[tax$IMDB_ID %notin% uniqueid,] # get people who weren't in previous year
  
  
  tax$cens=censyear
  tax$taxyear=i  # taxyear = first year we see them 
  
  # works for 1982, 1983
  
  if(i==1982){
    taxout=tax
  }
  else{
    taxout=rbind(taxout,tax)
  }
  # unique indivividuals here [don't get anyone who is already in the main dataset]
  uniqueid=unique(taxout$IMDB_ID)  
  taxout
  
}

# Fwrite
fwrite(taxout, "H:/Zheng_10223/Joint/Census/landing_tract1982_2005.csv")

# bring in the refugee info 

landing=read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2020_v1/data/stata/core_imdb/pnrf_1980_2020_f3_v1.dta")
cohort <- merge(x=taxout, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
# CREATE REFUGEE CLASS 

cohort$refugee=ifelse(cohort$IMMIGRATION_CATEGORY_CENSUS %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991'),1,0)

# Break up by census years:
cens81=cohort[cohort$cens=="81",]
cens86=cohort[cohort$cens=="86",]
cens91=cohort[cohort$cens=="91",]
cens96=cohort[cohort$cens=="96",]
cens01=cohort[cohort$cens=="01",]



# total : 

length(which(cohort$refugee==1))
length(which(cohort$refugee==0))

# counts: 
cens81count=cens81 %>% group_by(PR_F,CD_F,refugee) %>% summarize(count=n())
cens81count=cens81count %>% filter(count>100)



# counts: 
cens86count=cens81 %>% group_by(PR_F,CD_F,refugee) %>% summarize(count=n()) %>% filter(count>100)


# counts: 
cens91count=cens91 %>% group_by(PR_F,CD_F,refugee) %>% summarize(count=n()) %>% filter(count>100)



# counts: 
cens96count=cens96 %>% group_by(PR_F,CD_F,refugee) %>% summarize(count=n()) %>% filter(count>100)



# counts: 
cens01count=cens01 %>% group_by(PR_F,CD_F,refugee) %>% summarize(count=n()) %>% filter(count>100)



setwd()

corak <- read_csv("~/frac_censusdivision.csv")
corak <- data.table(corak)



census <- rbind(cens81count, cens86count, cens91count, cens96count, cens01count)
census <- data.table(census)

census$CD_F <- as.character(census$CD_F)
census$CD_F[which(nchar(census$CD_F)==1)] <- paste("0", census$CD_F[which(nchar(census$CD_F)==1)], sep="")
census$CD86 <- paste(census$PR_F, census$CD_F, sep="")
census$CD86 <- as.numeric(census$CD86)

refugee <- merge(x=census[refugee==1, .(pop=sum(rounded)), by="CD86"], y=unique(corak[,c("CD86", "up25")]), by.x="CD86", by.y="CD86")
refugee$pop <- refugee$pop/sum(refugee$pop)

nonrefugee <- merge(x=census[refugee==0, .(pop=sum(rounded)), by="CD86"], y=unique(corak[,c("CD86", "up25")]), by.x="CD86", by.y="CD86")
nonrefugee$pop <- nonrefugee$pop/sum(nonrefugee$pop)

refugee_dist <- rep(refugee$up25, refugee$pop*97110)
nonrefugee_dist <- rep(nonrefugee$up25, nonrefugee$pop*331040)
canadian_dist <- rep(corak$up25[which(corak$Category=="Canadian-Born")], corak$frac_imm_cd[which(corak$Category=="Canadian-Born")]*1295323)


# Run one-sided KS-test, clearly reject
immigrant_vs_canadian <- ks.test(canadian_dist, c(refugee_dist, nonrefugee_dist), alternative="less")
refugee_nonreefugee <- ks.test(refugee_dist, nonrefugee_dist, alternative="less")


plot(density(refugee_dist, bw=0.8), col="royalblue", xlim=c(30,56), lwd=5, xlab="Absolute Upward Mobility of Census Division", ylab="Fraction", cex.lab=2, cex.axis=2, main="", ylim=c(0,0.28))
polygon(x=density(refugee_dist, bw=0.8)$x, y=density(refugee_dist, bw=0.8)$y, border=FALSE, col=alpha("royalblue", 0.3))
lines(density(canadian_dist, bw=0.8), col="grey", lwd=5)
polygon(x=density(canadian_dist, bw=0.8)$x, y=density(canadian_dist, bw=0.8)$y, border=FALSE, col=alpha("grey30", 0.3))
lines(density(nonrefugee_dist, bw=0.8), col="darkorange", lwd=5)
polygon(x=density(nonrefugee_dist, bw=0.8)$x, y=density(nonrefugee_dist, bw=0.8)$y, border=FALSE, col=alpha("darkorange", 0.3))
legend(x=29.35, y=0.28, fill=c("grey", "darkorange", "royalblue"), c("Canadian-Born", "Non-Refugee", "Refugee"), cex=1.5, bty="n")

text(x=33, y=0.22, "Kolmogorov-Smirnov Test:", cex=1.5)
text(x=30,y=0.205, expression("H"[0]), cex=1.5)
text(x=30,y=0.19, expression("H"[0]), cex=1.5)

text(x=36.132 ,y=0.205, ": Immigrants < Canadian Born: p-value <0.001", cex=1.5)
text(x=35.8, y=0.19, ": Refugee < Non-Refugees: p-value <0.001", cex=1.5)

